Call in libraries
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
Read in data set
# Read the raw CSV and normalise its column names with janitor::clean_names()
# (e.g. `Total Volume` becomes total_volume, `4046` becomes x4046).
avocado <- janitor::clean_names(read_csv("data/avocado.csv"))
New names:
• `` -> `...1`
Rows: 18249 Columns: 14
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): type, region
dbl (11): ...1, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, year
date (1): Date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Use glimpse
# Print one line per column: name, type and the first few values.
avocado %>%
  glimpse()
Rows: 18,249
Columns: 14
$ x1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,…
$ date <date> 2015-12-27, 2015-12-20, 2015-12-13, 2015-12-06, 2015-11-29, 2015-11-22, 2015-11-15, 2015-11-08, 2015-11-01, …
$ average_price <dbl> 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.12, 1.28, 1.31, 0.99, 1.33, 1.28, 1.11, 1.07, 1…
$ total_volume <dbl> 64236.62, 54876.98, 118220.22, 78992.15, 51039.60, 55979.78, 83453.76, 109428.33, 99811.42, 74338.76, 84843.4…
$ x4046 <dbl> 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 703.75, 1022.15, 842.40, 924.86, 1582.03, 2268.32…
$ x4225 <dbl> 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 73672.72, 101815.36, 87315.57, 64757.44, 75595.8…
$ x4770 <dbl> 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34, 113.00, 117.07, 105.32, 101.36, 154.84, 150.5…
$ total_bags <dbl> 8696.87, 9505.56, 8145.35, 5811.16, 6183.95, 6683.91, 8318.86, 6829.22, 11388.36, 8625.92, 8205.66, 10123.90,…
$ small_bags <dbl> 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81, 6266.85, 11104.53, 8061.47, 7877.86, 9866.27, …
$ large_bags <dbl> 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, 283.83, 564.45, 327.80, 257.63, 376.77, 145.59,…
$ x_large_bags <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0…
$ type <chr> "conventional", "conventional", "conventional", "conventional", "conventional", "conventional", "conventional…
$ year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2…
$ region <chr> "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany",…
Use skim
# skimr supplies skim(), which prints a data summary for the data frame
# (row/column counts and a breakdown of column types).
library(skimr)
skim(avocado)
── Data Summary ────────────────────────
Values
Name avocado
Number of rows 18249
Number of columns 14
_______________________
Column type frequency:
character 2
Date 1
numeric 11
________________________
Group variables None
Look at first six rows
# Show the first six rows (head()'s default).
head(avocado)
Look at distribution of average prices
# Histogram of average_price. `bins = 30` states explicitly the value that
# ggplot2 was already falling back to, so the plot is unchanged but the
# "`stat_bin()` using `bins = 30`" message is no longer emitted.
avocado %>%
  ggplot(aes(x = average_price)) +
  geom_histogram(bins = 30)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Find if there are any aliases
# Check for aliased (linearly dependent) terms when average_price is
# regressed on every other column of avocado.
alias(object = average_price ~ ., data = avocado)
Model :
average_price ~ x1 + date + total_volume + x4046 + x4225 + x4770 +
total_bags + small_bags + large_bags + x_large_bags + type +
year + region
Find all the distinct regions
# List each region value once.
distinct(avocado, region)
Feature Engineering
# Feature engineering steps:
# - extract the month name from `date`
# - add a `total_single` column (total_volume = total_single + total_bags)
# - convert `year` to a factor
# - collapse `region` into groups
# - convert `type` to a logical (`is_organic`)
avocado <- avocado %>%
  mutate(month = month(date, label = TRUE, abbr = FALSE)) %>%
  # total_single is the sum of the three per-code columns; it is placed
  # directly after total_volume.
  mutate(total_single = x4046 + x4225 + x4770,
         .after = total_volume) %>%
  mutate(year = as.factor(year)) %>%
  # Region names are matched exactly. case_when() takes the first branch
  # that is TRUE, so a substring test on "West" would also capture
  # "WestTexNewMexico" and label it "West" instead of "South Central".
  mutate(region = case_when(
    region %in% c("Albany", "BaltimoreWashington", "Boston",
                  "BuffaloRochester", "HarrisburgScranton",
                  "HartfordSpringfield", "NewYork", "Northeast",
                  "NorthernNewEngland", "Philadelphia", "Pittsburgh",
                  "Syracuse") ~ "Northeast",
    region %in% c("Atlanta", "Jacksonville", "MiamiFtLauderdale", "Orlando",
                  "SouthCarolina", "Southeast", "Tampa") ~ "Southeast",
    region %in% c("Charlotte", "Louisville", "Midsouth", "Nashville",
                  "RaleighGreensboro", "RichmondNorfolk",
                  "Roanoke") ~ "Midsouth",
    region %in% c("Chicago", "CincinnatiDayton", "Columbus", "Detroit",
                  "GrandRapids", "GreatLakes",
                  "Indianapolis") ~ "Great Lakes",
    region %in% c("DallasFtWorth", "Houston", "NewOrleansMobile",
                  "SouthCentral", "WestTexNewMexico") ~ "South Central",
    region %in% c("Plains", "StLouis") ~ "Plains",
    region %in% c("California", "LosAngeles", "Sacramento", "SanDiego",
                  "SanFrancisco") ~ "California",
    region %in% c("Boise", "Denver", "LasVegas", "PhoenixTucson", "Portland",
                  "Seattle", "Spokane", "West") ~ "West",
    region == "TotalUS" ~ "TotalUS",
    # Any value not listed above is left as it is.
    TRUE ~ region
  )) %>%
  mutate(is_organic = type == "organic")
Feature Engineering cont
# Reduce avocado to these seven columns, in this order.
avocado <- select(
  avocado,
  average_price, total_single, total_bags, is_organic, year, region, month
)
Further feature engineering:
- ln_total_single
- ln_total_bags
- prop_single_bags

Split the data into test/train data sets (90/10)
n_row
[1] 18249
Initial ggpairs
# Pairs plot of the training data with average_price moved to the first
# row/column. progress = FALSE stops ggpairs() from printing a progress-bar
# line for every panel into the output.
avocado_train %>%
  select(average_price, everything()) %>%
  ggpairs(progress = FALSE)
plot: [1,1] [>-----------------------------------------------------------------------------------------------------] 1% est: 0s
plot: [1,2] [=>----------------------------------------------------------------------------------------------------] 2% est: 4s
plot: [1,3] [==>---------------------------------------------------------------------------------------------------] 3% est: 4s
plot: [1,4] [===>--------------------------------------------------------------------------------------------------] 4% est: 4s
plot: [1,5] [====>-------------------------------------------------------------------------------------------------] 5% est: 4s
plot: [1,6] [=====>------------------------------------------------------------------------------------------------] 6% est: 5s
plot: [1,7] [======>-----------------------------------------------------------------------------------------------] 7% est: 6s
plot: [1,8] [=======>----------------------------------------------------------------------------------------------] 8% est: 6s
plot: [1,9] [========>---------------------------------------------------------------------------------------------] 9% est: 6s
plot: [1,10] [=========>-------------------------------------------------------------------------------------------] 10% est: 6s
plot: [2,1] [==========>-------------------------------------------------------------------------------------------] 11% est: 6s
plot: [2,2] [===========>------------------------------------------------------------------------------------------] 12% est: 6s
plot: [2,3] [============>-----------------------------------------------------------------------------------------] 13% est: 6s
plot: [2,4] [=============>----------------------------------------------------------------------------------------] 14% est: 5s
plot: [2,5] [==============>---------------------------------------------------------------------------------------] 15% est: 5s
plot: [2,6] [===============>--------------------------------------------------------------------------------------] 16% est: 5s
plot: [2,7] [================>-------------------------------------------------------------------------------------] 17% est: 5s
plot: [2,8] [=================>------------------------------------------------------------------------------------] 18% est: 5s
plot: [2,9] [==================>-----------------------------------------------------------------------------------] 19% est: 5s
plot: [2,10] [===================>---------------------------------------------------------------------------------] 20% est: 5s
plot: [3,1] [====================>---------------------------------------------------------------------------------] 21% est: 5s
plot: [3,2] [=====================>--------------------------------------------------------------------------------] 22% est: 5s
plot: [3,3] [======================>-------------------------------------------------------------------------------] 23% est: 5s
plot: [3,4] [=======================>------------------------------------------------------------------------------] 24% est: 5s
plot: [3,5] [=========================>----------------------------------------------------------------------------] 25% est: 4s
plot: [3,6] [==========================>---------------------------------------------------------------------------] 26% est: 5s
plot: [3,7] [===========================>--------------------------------------------------------------------------] 27% est: 5s
plot: [3,8] [============================>-------------------------------------------------------------------------] 28% est: 5s
plot: [3,9] [=============================>------------------------------------------------------------------------] 29% est: 5s
plot: [3,10] [=============================>-----------------------------------------------------------------------] 30% est: 4s
plot: [4,1] [===============================>----------------------------------------------------------------------] 31% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,2] [================================>---------------------------------------------------------------------] 32% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,3] [=================================>--------------------------------------------------------------------] 33% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,4] [==================================>-------------------------------------------------------------------] 34% est: 4s
plot: [4,5] [===================================>------------------------------------------------------------------] 35% est: 4s
plot: [4,6] [====================================>-----------------------------------------------------------------] 36% est: 4s
plot: [4,7] [=====================================>----------------------------------------------------------------] 37% est: 4s
plot: [4,8] [======================================>---------------------------------------------------------------] 38% est: 4s
plot: [4,9] [=======================================>--------------------------------------------------------------] 39% est: 4s
plot: [4,10] [=======================================>-------------------------------------------------------------] 40% est: 4s
plot: [5,1] [=========================================>------------------------------------------------------------] 41% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,2] [==========================================>-----------------------------------------------------------] 42% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,3] [===========================================>----------------------------------------------------------] 43% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,4] [============================================>---------------------------------------------------------] 44% est: 4s
plot: [5,5] [=============================================>--------------------------------------------------------] 45% est: 4s
plot: [5,6] [==============================================>-------------------------------------------------------] 46% est: 4s
plot: [5,7] [===============================================>------------------------------------------------------] 47% est: 4s
plot: [5,8] [================================================>-----------------------------------------------------] 48% est: 4s
plot: [5,9] [=================================================>----------------------------------------------------] 49% est: 4s
plot: [5,10] [=================================================>---------------------------------------------------] 50% est: 4s
plot: [6,1] [===================================================>--------------------------------------------------] 51% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,2] [====================================================>-------------------------------------------------] 52% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,3] [=====================================================>------------------------------------------------] 53% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,4] [======================================================>-----------------------------------------------] 54% est: 4s
plot: [6,5] [=======================================================>----------------------------------------------] 55% est: 4s
plot: [6,6] [========================================================>---------------------------------------------] 56% est: 4s
plot: [6,7] [=========================================================>--------------------------------------------] 57% est: 4s
plot: [6,8] [==========================================================>-------------------------------------------] 58% est: 4s
plot: [6,9] [===========================================================>------------------------------------------] 59% est: 4s
plot: [6,10] [============================================================>----------------------------------------] 60% est: 4s
plot: [7,1] [=============================================================>----------------------------------------] 61% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,2] [==============================================================>---------------------------------------] 62% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,3] [===============================================================>--------------------------------------] 63% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,4] [================================================================>-------------------------------------] 64% est: 4s
plot: [7,5] [=================================================================>------------------------------------] 65% est: 4s
plot: [7,6] [==================================================================>-----------------------------------] 66% est: 4s
plot: [7,7] [===================================================================>----------------------------------] 67% est: 4s
plot: [7,8] [====================================================================>---------------------------------] 68% est: 4s
plot: [7,9] [=====================================================================>--------------------------------] 69% est: 3s
plot: [7,10] [======================================================================>------------------------------] 70% est: 3s
plot: [8,1] [=======================================================================>------------------------------] 71% est: 3s
plot: [8,2] [========================================================================>-----------------------------] 72% est: 3s
plot: [8,3] [=========================================================================>----------------------------] 73% est: 3s
plot: [8,4] [==========================================================================>---------------------------] 74% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,5] [===========================================================================>--------------------------] 75% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,6] [=============================================================================>------------------------] 76% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,7] [==============================================================================>-----------------------] 77% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,8] [===============================================================================>----------------------] 78% est: 3s
plot: [8,9] [================================================================================>---------------------] 79% est: 3s
plot: [8,10] [================================================================================>--------------------] 80% est: 2s
plot: [9,1] [==================================================================================>-------------------] 81% est: 2s
plot: [9,2] [===================================================================================>------------------] 82% est: 2s
plot: [9,3] [====================================================================================>-----------------] 83% est: 2s
plot: [9,4] [=====================================================================================>----------------] 84% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,5] [======================================================================================>---------------] 85% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,6] [=======================================================================================>--------------] 86% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,7] [========================================================================================>-------------] 87% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,8] [=========================================================================================>------------] 88% est: 1s
plot: [9,9] [==========================================================================================>-----------] 89% est: 1s
plot: [9,10] [==========================================================================================>----------] 90% est: 1s
plot: [10,1] [===========================================================================================>---------] 91% est: 1s
plot: [10,2] [============================================================================================>--------] 92% est: 1s
plot: [10,3] [=============================================================================================>-------] 93% est: 1s
plot: [10,4] [==============================================================================================>------] 94% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [10,5] [===============================================================================================>-----] 95% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [10,6] [================================================================================================>----] 96% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [10,7] [=================================================================================================>---] 97% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [10,8] [==================================================================================================>--] 98% est: 0s
plot: [10,9] [===================================================================================================>-] 99% est: 0s
plot: [10,10] [====================================================================================================]100% est: 0s
Based on this, look at the following for the first predictor:
- ln_total_bags
- is_organic

Plot ln_total_bags histogram
# Histogram of ln_total_bags in the training set. `bins = 30` states
# explicitly the value ggplot2 was already falling back to, so the plot is
# unchanged but the "`stat_bin()` using `bins = 30`" message is not emitted.
avocado_train %>%
  ggplot(aes(x = ln_total_bags)) +
  geom_histogram(bins = 30)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot is_organic boxplot
# Boxplot of average_price for each value of is_organic.
ggplot(avocado_train, aes(x = is_organic, y = average_price)) +
  geom_boxplot()
Create first model
# First candidate model: average_price explained by ln_total_bags alone.
# autoplot() draws the plots for the fitted model; summary() prints the
# coefficient table and fit statistics.
mod1a <- lm(formula = average_price ~ ln_total_bags, data = avocado_train)
autoplot(mod1a)
summary(mod1a)
Call:
lm(formula = average_price ~ ln_total_bags, data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.0395 -0.2346 -0.0377 0.1976 1.6295
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.448530 0.011805 207.41 <2e-16 ***
ln_total_bags -0.102457 0.001131 -90.57 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3276 on 16423 degrees of freedom
Multiple R-squared: 0.3331, Adjusted R-squared: 0.3331
F-statistic: 8203 on 1 and 16423 DF, p-value: < 2.2e-16
Results:
# Second candidate model: average_price explained by is_organic alone.
# autoplot() draws the plots for the fitted model; summary() prints the
# coefficient table and fit statistics.
mod1b <- lm(formula = average_price ~ is_organic, data = avocado_train)
autoplot(mod1b)
summary(mod1b)
Call:
lm(formula = average_price ~ is_organic, data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.21288 -0.19668 -0.02668 0.18332 1.59712
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.156682 0.003478 332.6 <2e-16 ***
is_organicTRUE 0.496196 0.004920 100.9 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3152 on 16423 degrees of freedom
Multiple R-squared: 0.3825, Adjusted R-squared: 0.3825
F-statistic: 1.017e+04 on 1 and 16423 DF, p-value: < 2.2e-16
Results:
Look at and plot residuals:
# Attach the residuals of mod1b to the training data (as `resid`), then drop
# the response and the predictor that mod1b already uses.
avocado_resid <- avocado_train %>%
  add_residuals(mod1b) %>%
  select(-c(average_price, is_organic))

# Pairs plot with resid moved to the first row/column. progress = FALSE stops
# ggpairs() from printing a progress-bar line for every panel into the output.
avocado_resid %>%
  select(resid, everything()) %>%
  ggpairs(progress = FALSE)
plot: [1,1] [>-----------------------------------------------------------------------------------------------------] 1% est: 0s
plot: [1,2] [==>---------------------------------------------------------------------------------------------------] 2% est: 3s
plot: [1,3] [===>--------------------------------------------------------------------------------------------------] 4% est: 3s
plot: [1,4] [====>-------------------------------------------------------------------------------------------------] 5% est: 3s
plot: [1,5] [=====>------------------------------------------------------------------------------------------------] 6% est: 3s
plot: [1,6] [=======>----------------------------------------------------------------------------------------------] 7% est: 4s
plot: [1,7] [========>---------------------------------------------------------------------------------------------] 9% est: 5s
plot: [1,8] [=========>--------------------------------------------------------------------------------------------] 10% est: 4s
plot: [1,9] [==========>-------------------------------------------------------------------------------------------] 11% est: 4s
plot: [2,1] [============>-----------------------------------------------------------------------------------------] 12% est: 4s
plot: [2,2] [=============>----------------------------------------------------------------------------------------] 14% est: 4s
plot: [2,3] [==============>---------------------------------------------------------------------------------------] 15% est: 4s
plot: [2,4] [===============>--------------------------------------------------------------------------------------] 16% est: 4s
plot: [2,5] [=================>------------------------------------------------------------------------------------] 17% est: 4s
plot: [2,6] [==================>-----------------------------------------------------------------------------------] 19% est: 4s
plot: [2,7] [===================>----------------------------------------------------------------------------------] 20% est: 4s
plot: [2,8] [====================>---------------------------------------------------------------------------------] 21% est: 4s
plot: [2,9] [======================>-------------------------------------------------------------------------------] 22% est: 4s
plot: [3,1] [=======================>------------------------------------------------------------------------------] 23% est: 4s
plot: [3,2] [========================>-----------------------------------------------------------------------------] 25% est: 4s
plot: [3,3] [=========================>----------------------------------------------------------------------------] 26% est: 4s
plot: [3,4] [===========================>--------------------------------------------------------------------------] 27% est: 3s
plot: [3,5] [============================>-------------------------------------------------------------------------] 28% est: 3s
plot: [3,6] [=============================>------------------------------------------------------------------------] 30% est: 3s
plot: [3,7] [==============================>-----------------------------------------------------------------------] 31% est: 3s
plot: [3,8] [================================>---------------------------------------------------------------------] 32% est: 3s
plot: [3,9] [=================================>--------------------------------------------------------------------] 33% est: 3s
plot: [4,1] [==================================>-------------------------------------------------------------------] 35% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,2] [====================================>-----------------------------------------------------------------] 36% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,3] [=====================================>----------------------------------------------------------------] 37% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,4] [======================================>---------------------------------------------------------------] 38% est: 4s
plot: [4,5] [=======================================>--------------------------------------------------------------] 40% est: 3s
plot: [4,6] [=========================================>------------------------------------------------------------] 41% est: 3s
plot: [4,7] [==========================================>-----------------------------------------------------------] 42% est: 3s
plot: [4,8] [===========================================>----------------------------------------------------------] 43% est: 3s
plot: [4,9] [============================================>---------------------------------------------------------] 44% est: 3s
plot: [5,1] [==============================================>-------------------------------------------------------] 46% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,2] [===============================================>------------------------------------------------------] 47% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,3] [================================================>-----------------------------------------------------] 48% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,4] [=================================================>----------------------------------------------------] 49% est: 4s
plot: [5,5] [===================================================>--------------------------------------------------] 51% est: 4s
plot: [5,6] [====================================================>-------------------------------------------------] 52% est: 3s
plot: [5,7] [=====================================================>------------------------------------------------] 53% est: 3s
plot: [5,8] [======================================================>-----------------------------------------------] 54% est: 3s
plot: [5,9] [========================================================>---------------------------------------------] 56% est: 3s
plot: [6,1] [=========================================================>--------------------------------------------] 57% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,2] [==========================================================>-------------------------------------------] 58% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,3] [===========================================================>------------------------------------------] 59% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,4] [=============================================================>----------------------------------------] 60% est: 3s
plot: [6,5] [==============================================================>---------------------------------------] 62% est: 3s
plot: [6,6] [===============================================================>--------------------------------------] 63% est: 3s
plot: [6,7] [================================================================>-------------------------------------] 64% est: 3s
plot: [6,8] [==================================================================>-----------------------------------] 65% est: 3s
plot: [6,9] [===================================================================>----------------------------------] 67% est: 3s
plot: [7,1] [====================================================================>---------------------------------] 68% est: 3s
plot: [7,2] [======================================================================>-------------------------------] 69% est: 3s
plot: [7,3] [=======================================================================>------------------------------] 70% est: 3s
plot: [7,4] [========================================================================>-----------------------------] 72% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,5] [=========================================================================>----------------------------] 73% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,6] [===========================================================================>--------------------------] 74% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,7] [============================================================================>-------------------------] 75% est: 2s
plot: [7,8] [=============================================================================>------------------------] 77% est: 2s
plot: [7,9] [==============================================================================>-----------------------] 78% est: 2s
plot: [8,1] [================================================================================>---------------------] 79% est: 2s
plot: [8,2] [=================================================================================>--------------------] 80% est: 2s
plot: [8,3] [==================================================================================>-------------------] 81% est: 2s
plot: [8,4] [===================================================================================>------------------] 83% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,5] [=====================================================================================>----------------] 84% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,6] [======================================================================================>---------------] 85% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,7] [=======================================================================================>--------------] 86% est: 1s
plot: [8,8] [========================================================================================>-------------] 88% est: 1s
plot: [8,9] [==========================================================================================>-----------] 89% est: 1s
plot: [9,1] [===========================================================================================>----------] 90% est: 1s
plot: [9,2] [============================================================================================>---------] 91% est: 1s
plot: [9,3] [=============================================================================================>--------] 93% est: 1s
plot: [9,4] [===============================================================================================>------] 94% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,5] [================================================================================================>-----] 95% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,6] [=================================================================================================>----] 96% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,7] [==================================================================================================>---] 98% est: 0s
plot: [9,8] [====================================================================================================>-] 99% est: 0s
plot: [9,9] [======================================================================================================]100% est: 0s
Based on this, look at the following for the second predictor:
ln_total_bags, region, month
Plot the region boxplot
# Box plot of average_price for each region in the training data.
ggplot(avocado_train, aes(x = region, y = average_price)) +
  geom_boxplot()
Plot the month boxplot
# Box plot of average_price for each month in the training data.
ggplot(avocado_train, aes(x = month, y = average_price)) +
  geom_boxplot()
# Second-predictor candidate (a): is_organic plus ln_total_bags.
mod2a <- lm(
  formula = average_price ~ is_organic + ln_total_bags,
  data = avocado_train
)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod2a)
summary(mod2a)
Call:
lm(formula = average_price ~ is_organic + ln_total_bags, data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.09530 -0.20351 -0.02322 0.18008 1.57438
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.727684 0.018422 93.79 <2e-16 ***
is_organicTRUE 0.338227 0.006923 48.86 <2e-16 ***
ln_total_bags -0.048296 0.001532 -31.53 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3061 on 16422 degrees of freedom
Multiple R-squared: 0.4178, Adjusted R-squared: 0.4177
F-statistic: 5891 on 2 and 16422 DF, p-value: < 2.2e-16
Results:
# Second-predictor candidate (b): is_organic plus region.
mod2b <- lm(
  formula = average_price ~ is_organic + region,
  data = avocado_train
)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod2b)
summary(mod2b)
Call:
lm(formula = average_price ~ is_organic + region, data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.15982 -0.18145 -0.02334 0.15423 1.51673
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.236815 0.007845 157.655 < 2e-16 ***
is_organicTRUE 0.496453 0.004571 108.612 < 2e-16 ***
regionGreat Lakes -0.133450 0.009831 -13.575 < 2e-16 ***
regionMidsouth -0.113476 0.009827 -11.547 < 2e-16 ***
regionNortheast 0.078179 0.008930 8.755 < 2e-16 ***
regionPlains -0.052484 0.014043 -3.737 0.000187 ***
regionSouth Central -0.353895 0.011244 -31.475 < 2e-16 ***
regionSoutheast -0.057496 0.009842 -5.842 5.27e-09 ***
regionTotalUS -0.163394 0.018349 -8.905 < 2e-16 ***
regionWest -0.162423 0.009373 -17.330 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2929 on 16415 degrees of freedom
Multiple R-squared: 0.4672, Adjusted R-squared: 0.4669
F-statistic: 1599 on 9 and 16415 DF, p-value: < 2.2e-16
Results:
# Second-predictor candidate (c): is_organic plus month.
# The summary reports month as polynomial contrast terms (month.L, month.Q,
# ...), so month is presumably an ordered factor -- confirm where it is built.
mod2c <- lm(
  formula = average_price ~ is_organic + month,
  data = avocado_train
)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod2c)
summary(mod2c)
Call:
lm(formula = average_price ~ is_organic + month, data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.1497 -0.1956 -0.0206 0.1868 1.5494
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.164277 0.003319 350.812 < 2e-16 ***
is_organicTRUE 0.496363 0.004676 106.144 < 2e-16 ***
month.L 0.212495 0.007952 26.721 < 2e-16 ***
month.Q -0.152996 0.008071 -18.957 < 2e-16 ***
month.C -0.189055 0.008083 -23.391 < 2e-16 ***
month^4 -0.073337 0.008020 -9.145 < 2e-16 ***
month^5 -0.006010 0.008131 -0.739 0.459796
month^6 0.060764 0.008248 7.367 1.83e-13 ***
month^7 -0.001815 0.008065 -0.225 0.821930
month^8 0.031413 0.008105 3.876 0.000107 ***
month^9 0.014378 0.008340 1.724 0.084731 .
month^10 -0.025341 0.008375 -3.026 0.002485 **
month^11 0.012087 0.008410 1.437 0.150667
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2996 on 16412 degrees of freedom
Multiple R-squared: 0.4426, Adjusted R-squared: 0.4421
F-statistic: 1086 on 12 and 16412 DF, p-value: < 2.2e-16
Results:
Plot the residuals
# Attach the mod2b residuals to the training data, then drop the response and
# the two predictors mod2b already uses so only unused columns remain.
avocado_resid <- select(
  add_residuals(avocado_train, mod2b),
  -c(average_price, is_organic, region)
)

# Put resid first so it is the first variable in the pairs plot.
ggpairs(select(avocado_resid, resid, everything()))
plot: [1,1] [=>----------------------------------------------------------------------------------------------------] 2% est: 0s
plot: [1,2] [==>---------------------------------------------------------------------------------------------------] 3% est: 2s
plot: [1,3] [====>-------------------------------------------------------------------------------------------------] 5% est: 3s
plot: [1,4] [=====>------------------------------------------------------------------------------------------------] 6% est: 3s
plot: [1,5] [=======>----------------------------------------------------------------------------------------------] 8% est: 4s
plot: [1,6] [=========>--------------------------------------------------------------------------------------------] 9% est: 5s
plot: [1,7] [==========>-------------------------------------------------------------------------------------------] 11% est: 4s
plot: [1,8] [============>-----------------------------------------------------------------------------------------] 12% est: 4s
plot: [2,1] [=============>----------------------------------------------------------------------------------------] 14% est: 4s
plot: [2,2] [===============>--------------------------------------------------------------------------------------] 16% est: 4s
plot: [2,3] [=================>------------------------------------------------------------------------------------] 17% est: 4s
plot: [2,4] [==================>-----------------------------------------------------------------------------------] 19% est: 3s
plot: [2,5] [====================>---------------------------------------------------------------------------------] 20% est: 4s
plot: [2,6] [=====================>--------------------------------------------------------------------------------] 22% est: 4s
plot: [2,7] [=======================>------------------------------------------------------------------------------] 23% est: 4s
plot: [2,8] [=========================>----------------------------------------------------------------------------] 25% est: 3s
plot: [3,1] [==========================>---------------------------------------------------------------------------] 27% est: 3s
plot: [3,2] [============================>-------------------------------------------------------------------------] 28% est: 3s
plot: [3,3] [=============================>------------------------------------------------------------------------] 30% est: 3s
plot: [3,4] [===============================>----------------------------------------------------------------------] 31% est: 3s
plot: [3,5] [================================>---------------------------------------------------------------------] 33% est: 3s
plot: [3,6] [==================================>-------------------------------------------------------------------] 34% est: 3s
plot: [3,7] [====================================>-----------------------------------------------------------------] 36% est: 3s
plot: [3,8] [=====================================>----------------------------------------------------------------] 38% est: 3s
plot: [4,1] [=======================================>--------------------------------------------------------------] 39% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,2] [========================================>-------------------------------------------------------------] 41% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,3] [==========================================>-----------------------------------------------------------] 42% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,4] [============================================>---------------------------------------------------------] 44% est: 3s
plot: [4,5] [=============================================>--------------------------------------------------------] 45% est: 3s
plot: [4,6] [===============================================>------------------------------------------------------] 47% est: 3s
plot: [4,7] [================================================>-----------------------------------------------------] 48% est: 3s
plot: [4,8] [==================================================>---------------------------------------------------] 50% est: 2s
plot: [5,1] [====================================================>-------------------------------------------------] 52% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,2] [=====================================================>------------------------------------------------] 53% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,3] [=======================================================>----------------------------------------------] 55% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,4] [========================================================>---------------------------------------------] 56% est: 3s
plot: [5,5] [==========================================================>-------------------------------------------] 58% est: 3s
plot: [5,6] [============================================================>-----------------------------------------] 59% est: 3s
plot: [5,7] [=============================================================>----------------------------------------] 61% est: 3s
plot: [5,8] [===============================================================>--------------------------------------] 62% est: 3s Warning in ifelse(x >= 0, x, max + 1 + x) :
restarting interrupted promise evaluation
plot: [6,1] [================================================================>-------------------------------------] 64% est: 2s
plot: [6,2] [==================================================================>-----------------------------------] 66% est: 2s
plot: [6,3] [====================================================================>---------------------------------] 67% est: 2s
plot: [6,4] [=====================================================================>--------------------------------] 69% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,5] [=======================================================================>------------------------------] 70% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,6] [========================================================================>-----------------------------] 72% est: 2s
plot: [6,7] [==========================================================================>---------------------------] 73% est: 2s
plot: [6,8] [===========================================================================>--------------------------] 75% est: 2s
plot: [7,1] [=============================================================================>------------------------] 77% est: 2s
plot: [7,2] [===============================================================================>----------------------] 78% est: 2s
plot: [7,3] [================================================================================>---------------------] 80% est: 1s
plot: [7,4] [==================================================================================>-------------------] 81% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,5] [===================================================================================>------------------] 83% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,6] [=====================================================================================>----------------] 84% est: 1s
plot: [7,7] [=======================================================================================>--------------] 86% est: 1s
plot: [7,8] [========================================================================================>-------------] 88% est: 1s
plot: [8,1] [==========================================================================================>-----------] 89% est: 1s
plot: [8,2] [===========================================================================================>----------] 91% est: 1s
plot: [8,3] [=============================================================================================>--------] 92% est: 1s
plot: [8,4] [===============================================================================================>------] 94% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,5] [================================================================================================>-----] 95% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,6] [==================================================================================================>---] 97% est: 0s
plot: [8,7] [===================================================================================================>--] 98% est: 0s
plot: [8,8] [======================================================================================================]100% est: 0s
Based on this, look at the following for the third predictor:
# Candidates listed for the next predictor: prop_single_bags, month.
# (These names were fused onto the assignment below, which made it assign to
# `prop_single_bagsmonthmod3a` and left `mod3a` undefined.)

# Third-predictor candidate (a): is_organic + region plus prop_single_bags.
mod3a <- lm(average_price ~ is_organic + region + prop_single_bags,
            data = avocado_train)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod3a)
summary(mod3a)
Call:
lm(formula = average_price ~ is_organic + region + prop_single_bags,
data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.01881 -0.18403 -0.02364 0.15186 1.49736
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.959189 0.012358 77.615 < 2e-16 ***
is_organicTRUE 0.566053 0.005081 111.408 < 2e-16 ***
regionGreat Lakes -0.090874 0.009709 -9.360 < 2e-16 ***
regionMidsouth -0.059627 0.009774 -6.101 1.08e-09 ***
regionNortheast 0.163889 0.009215 17.785 < 2e-16 ***
regionPlains 0.006730 0.013861 0.486 0.627
regionSouth Central -0.291893 0.011185 -26.096 < 2e-16 ***
regionSoutheast 0.007834 0.009873 0.793 0.428
regionTotalUS -0.115107 0.017988 -6.399 1.60e-10 ***
regionWest -0.110474 0.009326 -11.846 < 2e-16 ***
prop_single_bags 0.314894 0.011003 28.619 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2859 on 16414 degrees of freedom
Multiple R-squared: 0.4925, Adjusted R-squared: 0.4922
F-statistic: 1593 on 10 and 16414 DF, p-value: < 2.2e-16
Results:
# Third-predictor candidate (b): is_organic + region plus month.
mod3b <- lm(
  formula = average_price ~ is_organic + region + month,
  data = avocado_train
)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod3b)
summary(mod3b)
Call:
lm(formula = average_price ~ is_organic + region + month, data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.11408 -0.16736 -0.01074 0.15318 1.52588
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.244203 0.007400 168.138 < 2e-16 ***
is_organicTRUE 0.496619 0.004309 115.257 < 2e-16 ***
regionGreat Lakes -0.133220 0.009266 -14.377 < 2e-16 ***
regionMidsouth -0.114344 0.009263 -12.345 < 2e-16 ***
regionNortheast 0.078579 0.008417 9.336 < 2e-16 ***
regionPlains -0.051701 0.013236 -3.906 9.42e-05 ***
regionSouth Central -0.352245 0.010598 -33.236 < 2e-16 ***
regionSoutheast -0.057636 0.009277 -6.213 5.34e-10 ***
regionTotalUS -0.164598 0.017296 -9.517 < 2e-16 ***
regionWest -0.161862 0.008835 -18.321 < 2e-16 ***
month.L 0.211679 0.007327 28.889 < 2e-16 ***
month.Q -0.153842 0.007436 -20.688 < 2e-16 ***
month.C -0.188233 0.007447 -25.275 < 2e-16 ***
month^4 -0.073374 0.007389 -9.930 < 2e-16 ***
month^5 -0.007427 0.007492 -0.991 0.32152
month^6 0.060358 0.007600 7.942 2.12e-15 ***
month^7 -0.003000 0.007431 -0.404 0.68644
month^8 0.029506 0.007468 3.951 7.82e-05 ***
month^9 0.014368 0.007684 1.870 0.06153 .
month^10 -0.024322 0.007717 -3.152 0.00163 **
month^11 0.012001 0.007749 1.549 0.12147
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2761 on 16404 degrees of freedom
Multiple R-squared: 0.527, Adjusted R-squared: 0.5264
F-statistic: 913.7 on 20 and 16404 DF, p-value: < 2.2e-16
# F-test comparing mod2b (is_organic + region) against mod3b, which adds month.
anova(mod2b, mod3b)
Analysis of Variance Table
Model 1: average_price ~ is_organic + region
Model 2: average_price ~ is_organic + region + month
Res.Df RSS Df Sum of Sq F Pr(>F)
1 16415 1408.2
2 16404 1250.3 11 157.96 188.41 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Results:
Plot the residuals
# Attach the mod3b residuals to the training data, then drop the response and
# the three predictors mod3b already uses so only unused columns remain.
avocado_resid <- select(
  add_residuals(avocado_train, mod3b),
  -c(average_price, is_organic, region, month)
)

# Put resid first so it is the first variable in the pairs plot.
ggpairs(select(avocado_resid, resid, everything()))
plot: [1,1] [=>----------------------------------------------------------------------------------------------------] 2% est: 0s
plot: [1,2] [===>--------------------------------------------------------------------------------------------------] 4% est: 2s
plot: [1,3] [=====>------------------------------------------------------------------------------------------------] 6% est: 2s
plot: [1,4] [=======>----------------------------------------------------------------------------------------------] 8% est: 2s
plot: [1,5] [=========>--------------------------------------------------------------------------------------------] 10% est: 2s
plot: [1,6] [===========>------------------------------------------------------------------------------------------] 12% est: 2s
plot: [1,7] [==============>---------------------------------------------------------------------------------------] 14% est: 2s
plot: [2,1] [================>-------------------------------------------------------------------------------------] 16% est: 2s
plot: [2,2] [==================>-----------------------------------------------------------------------------------] 18% est: 2s
plot: [2,3] [====================>---------------------------------------------------------------------------------] 20% est: 2s
plot: [2,4] [======================>-------------------------------------------------------------------------------] 22% est: 2s
plot: [2,5] [========================>-----------------------------------------------------------------------------] 24% est: 2s
plot: [2,6] [==========================>---------------------------------------------------------------------------] 27% est: 2s
plot: [2,7] [============================>-------------------------------------------------------------------------] 29% est: 2s
plot: [3,1] [==============================>-----------------------------------------------------------------------] 31% est: 2s
plot: [3,2] [================================>---------------------------------------------------------------------] 33% est: 2s
plot: [3,3] [==================================>-------------------------------------------------------------------] 35% est: 2s
plot: [3,4] [====================================>-----------------------------------------------------------------] 37% est: 2s
plot: [3,5] [=======================================>--------------------------------------------------------------] 39% est: 2s
plot: [3,6] [=========================================>------------------------------------------------------------] 41% est: 2s
plot: [3,7] [===========================================>----------------------------------------------------------] 43% est: 2s
plot: [4,1] [=============================================>--------------------------------------------------------] 45% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,2] [===============================================>------------------------------------------------------] 47% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,3] [=================================================>----------------------------------------------------] 49% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,4] [===================================================>--------------------------------------------------] 51% est: 2s
plot: [4,5] [=====================================================>------------------------------------------------] 53% est: 2s
plot: [4,6] [=======================================================>----------------------------------------------] 55% est: 2s
plot: [4,7] [=========================================================>--------------------------------------------] 57% est: 2s
plot: [5,1] [===========================================================>------------------------------------------] 59% est: 2s
plot: [5,2] [=============================================================>----------------------------------------] 61% est: 2s
plot: [5,3] [================================================================>-------------------------------------] 63% est: 1s
plot: [5,4] [==================================================================>-----------------------------------] 65% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,5] [====================================================================>---------------------------------] 67% est: 1s
plot: [5,6] [======================================================================>-------------------------------] 69% est: 1s
plot: [5,7] [========================================================================>-----------------------------] 71% est: 1s
plot: [6,1] [==========================================================================>---------------------------] 73% est: 1s
plot: [6,2] [============================================================================>-------------------------] 76% est: 1s
plot: [6,3] [==============================================================================>-----------------------] 78% est: 1s
plot: [6,4] [================================================================================>---------------------] 80% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,5] [==================================================================================>-------------------] 82% est: 1s
plot: [6,6] [====================================================================================>-----------------] 84% est: 1s
plot: [6,7] [======================================================================================>---------------] 86% est: 1s
plot: [7,1] [=========================================================================================>------------] 88% est: 0s
plot: [7,2] [===========================================================================================>----------] 90% est: 0s
plot: [7,3] [=============================================================================================>--------] 92% est: 0s
plot: [7,4] [===============================================================================================>------] 94% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,5] [=================================================================================================>----] 96% est: 0s
plot: [7,6] [===================================================================================================>--] 98% est: 0s
plot: [7,7] [======================================================================================================]100% est: 0s
Based on this, look at the following for the fourth predictor:
# Candidates listed for the fourth predictor:
#   prop_single_bags, year, ln_total_bags.
# (These names were fused onto the assignment below, which made it assign to
# `prop_single_bagsyearln_total_bagsmod4a` and left `mod4a` undefined.)

# Fourth-predictor candidate (a): is_organic + region + month plus
# prop_single_bags.
mod4a <- lm(average_price ~ is_organic + region + month + prop_single_bags,
            data = avocado_train)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod4a)
summary(mod4a)
Call:
lm(formula = average_price ~ is_organic + region + month + prop_single_bags,
data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.04602 -0.17017 -0.01395 0.15441 1.53347
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.965397 0.011655 82.832 < 2e-16 ***
is_organicTRUE 0.566487 0.004780 118.507 < 2e-16 ***
regionGreat Lakes -0.090459 0.009125 -9.914 < 2e-16 ***
regionMidsouth -0.060191 0.009186 -6.552 5.84e-11 ***
regionNortheast 0.164601 0.008664 18.999 < 2e-16 ***
regionPlains 0.007735 0.013026 0.594 0.552665
regionSouth Central -0.289919 0.010513 -27.576 < 2e-16 ***
regionSoutheast 0.007950 0.009280 0.857 0.391667
regionTotalUS -0.116079 0.016903 -6.867 6.78e-12 ***
regionWest -0.109684 0.008765 -12.514 < 2e-16 ***
month.L 0.222057 0.007137 31.112 < 2e-16 ***
month.Q -0.136359 0.007258 -18.787 < 2e-16 ***
month.C -0.186582 0.007246 -25.749 < 2e-16 ***
month^4 -0.077521 0.007191 -10.781 < 2e-16 ***
month^5 -0.002371 0.007291 -0.325 0.744992
month^6 0.061071 0.007394 8.259 < 2e-16 ***
month^7 -0.004238 0.007230 -0.586 0.557812
month^8 0.034461 0.007268 4.741 2.14e-06 ***
month^9 0.016384 0.007477 2.191 0.028444 *
month^10 -0.026981 0.007509 -3.593 0.000328 ***
month^11 0.008597 0.007541 1.140 0.254241
prop_single_bags 0.316043 0.010389 30.420 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2686 on 16403 degrees of freedom
Multiple R-squared: 0.5522, Adjusted R-squared: 0.5517
F-statistic: 963.3 on 21 and 16403 DF, p-value: < 2.2e-16
# F-test comparing mod3b against mod4a, which adds prop_single_bags.
anova(mod3b, mod4a)
Analysis of Variance Table
Model 1: average_price ~ is_organic + region + month
Model 2: average_price ~ is_organic + region + month + prop_single_bags
Res.Df RSS Df Sum of Sq F Pr(>F)
1 16404 1250.3
2 16403 1183.5 1 66.769 925.41 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Results:
# Fourth-predictor candidate (b): is_organic + region + month plus year.
mod4b <- lm(
  formula = average_price ~ is_organic + region + month + year,
  data = avocado_train
)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod4b)
summary(mod4b)
Call:
lm(formula = average_price ~ is_organic + region + month + year,
data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.18315 -0.15607 -0.00215 0.14893 1.42981
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.206290 0.007770 155.259 < 2e-16 ***
is_organicTRUE 0.497309 0.004152 119.787 < 2e-16 ***
regionGreat Lakes -0.133280 0.008928 -14.928 < 2e-16 ***
regionMidsouth -0.113052 0.008925 -12.667 < 2e-16 ***
regionNortheast 0.079915 0.008110 9.854 < 2e-16 ***
regionPlains -0.050518 0.012754 -3.961 7.49e-05 ***
regionSouth Central -0.350466 0.010212 -34.320 < 2e-16 ***
regionSoutheast -0.056554 0.008939 -6.327 2.57e-10 ***
regionTotalUS -0.164547 0.016664 -9.874 < 2e-16 ***
regionWest -0.161476 0.008512 -18.970 < 2e-16 ***
month.L 0.224338 0.007451 30.109 < 2e-16 ***
month.Q -0.168833 0.007329 -23.037 < 2e-16 ***
month.C -0.189834 0.007180 -26.440 < 2e-16 ***
month^4 -0.071305 0.007157 -9.962 < 2e-16 ***
month^5 -0.014214 0.007268 -1.956 0.050514 .
month^6 0.056391 0.007335 7.688 1.57e-14 ***
month^7 -0.001947 0.007167 -0.272 0.785934
month^8 0.024216 0.007219 3.355 0.000797 ***
month^9 0.009515 0.007429 1.281 0.200263
month^10 -0.017405 0.007448 -2.337 0.019460 *
month^11 0.009856 0.007469 1.319 0.187026
year2016 -0.038593 0.005299 -7.283 3.40e-13 ***
year2017 0.137662 0.005285 26.048 < 2e-16 ***
year2018 0.087480 0.009366 9.340 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.266 on 16401 degrees of freedom
Multiple R-squared: 0.561, Adjusted R-squared: 0.5603
F-statistic: 911.1 on 23 and 16401 DF, p-value: < 2.2e-16
# F-test comparing mod3b against mod4b, which adds year.
anova(mod3b, mod4b)
Analysis of Variance Table
Model 1: average_price ~ is_organic + region + month
Model 2: average_price ~ is_organic + region + month + year
Res.Df RSS Df Sum of Sq F Pr(>F)
1 16404 1250.3
2 16401 1160.5 3 89.818 423.14 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Results:
# Fourth-predictor candidate (c): is_organic + region + month plus
# ln_total_bags.
mod4c <- lm(
  formula = average_price ~ is_organic + region + month + ln_total_bags,
  data = avocado_train
)

# Diagnostic plots first, then the coefficient table and fit statistics.
autoplot(mod4c)
summary(mod4c)
Call:
lm(formula = average_price ~ is_organic + region + month + ln_total_bags,
data = avocado_train)
Residuals:
Min 1Q Median 3Q Max
-1.05316 -0.16864 -0.01189 0.15567 1.48415
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.767770 0.019070 92.697 < 2e-16 ***
is_organicTRUE 0.352461 0.006423 54.876 < 2e-16 ***
regionGreat Lakes -0.150593 0.009047 -16.646 < 2e-16 ***
regionMidsouth -0.135357 0.009052 -14.953 < 2e-16 ***
regionNortheast 0.067748 0.008208 8.254 < 2e-16 ***
regionPlains -0.043875 0.012898 -3.402 0.000672 ***
regionSouth Central -0.327138 0.010360 -31.577 < 2e-16 ***
regionSoutheast -0.070486 0.009049 -7.790 7.12e-15 ***
regionTotalUS 0.020672 0.017971 1.150 0.250036
regionWest -0.162448 0.008607 -18.873 < 2e-16 ***
month.L 0.204078 0.007143 28.569 < 2e-16 ***
month.Q -0.151380 0.007245 -20.893 < 2e-16 ***
month.C -0.180654 0.007260 -24.883 < 2e-16 ***
month^4 -0.070975 0.007200 -9.858 < 2e-16 ***
month^5 -0.002157 0.007301 -0.295 0.767686
month^6 0.060066 0.007404 8.112 5.31e-16 ***
month^7 -0.001634 0.007240 -0.226 0.821448
month^8 0.030842 0.007276 4.239 2.26e-05 ***
month^9 0.015692 0.007487 2.096 0.036101 *
month^10 -0.024711 0.007519 -3.287 0.001016 **
month^11 0.009489 0.007550 1.257 0.208846
ln_total_bags -0.043995 0.001484 -29.655 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.269 on 16403 degrees of freedom
Multiple R-squared: 0.551, Adjusted R-squared: 0.5505
F-statistic: 958.7 on 21 and 16403 DF, p-value: < 2.2e-16
# F-test comparing mod3b against mod4c, which adds ln_total_bags.
anova(mod3b, mod4c)
Analysis of Variance Table
Model 1: average_price ~ is_organic + region + month
Model 2: average_price ~ is_organic + region + month + ln_total_bags
Res.Df RSS Df Sum of Sq F Pr(>F)
1 16404 1250.3
2 16403 1186.6 1 63.62 879.42 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Results:
Test the model
# Hold-out check: predict the test set with mod4c and keep only the observed
# and predicted price.
# NOTE(review): mod4c is the `+ ln_total_bags` model, whereas the k-fold
# cross-validation further down fits the `+ year` formula (mod4b's). Confirm
# which model is meant to be the final one so the two evaluations agree.
predictions_test <- select(
  add_predictions(avocado_test, mod4c),
  average_price, pred
)
predictions_test

# Squared error per row, then its mean (MSE) over the test set.
predictions_test <- mutate(
  predictions_test,
  sq_err = (pred - average_price)^2
)
mse_test <- mean(predictions_test[["sq_err"]])
mse_test # this is the MSE; its square root (RMSE) is taken next
[1] 0.08327954
# Test-set RMSE: square root of the mean squared error computed above.
sqrt(mse_test)
[1] 0.2885819
# Same calculation on the training set: predict with mod4c and keep only the
# observed and predicted price.
predictions_train <- select(
  add_predictions(avocado_train, mod4c),
  average_price, pred
)
predictions_train

# Squared error per row, then its mean (MSE) over the training set.
predictions_train <- mutate(
  predictions_train,
  sq_err = (pred - average_price)^2
)
mse_train <- mean(predictions_train[["sq_err"]])
mse_train
[1] 0.07224616
# Training-set RMSE: square root of the mean squared error computed above.
sqrt(mse_train)
[1] 0.2687865
K-fold cross validation
library(caret)
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
# 10-fold cross-validation settings; savePredictions keeps the hold-out
# predictions so they can be inspected through avo_model$pred below.
cv_10_fold <- trainControl(method = "cv",
                           number = 10,
                           savePredictions = TRUE)

# Fold assignment is random, so fix the seed immediately before training to
# make the per-fold metrics reproducible from run to run.
set.seed(2024)

# Linear model with the same formula as mod4b
# (is_organic + region + month + year), fitted under the CV scheme above.
avo_model <- train(average_price ~ is_organic + region + month + year,
                   data = avocado_train,
                   trControl = cv_10_fold,
                   method = "lm")

# Hold-out predictions and the per-fold performance table.
avo_model$pred
avo_model$resample

# Average RMSE across the folds.
mean(avo_model$resample$RMSE)
[1] 0.2661194
# Average R-squared across the cross-validation folds.
mean(avo_model$resample$Rsquared)
[1] 0.5600208